{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# pandas provides the DataFrame used throughout the notebook\n",
    "import pandas as pd\n",
    "\n",
    "# Read the Titanic passenger table from the local CSV file\n",
    "Titanic = pd.read_csv(filepath_or_buffer = r'C:\\Users\\Administrator\\Desktop\\Titanic.csv')\n",
    "# Show the first five rows as a quick sanity check\n",
    "Titanic.head(n = 5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Columns that are not used as model inputs\n",
    "unused_columns = ['PassengerId','Name','Ticket','Cabin']\n",
    "# Remove them from the frame in place\n",
    "Titanic.drop(labels = unused_columns, axis = 'columns', inplace = True)\n",
    "# Count the missing values left in each remaining column\n",
    "Titanic.isnull().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Impute missing ages per sex: a passenger with no recorded Age receives the\n",
    "# mean Age of the passengers of the same sex.\n",
    "fillna_Titanic = []\n",
    "for i in Titanic.Sex.unique():\n",
    "    same_sex = Titanic.Sex == i\n",
    "    # fillna has to hand back a new frame here. With inplace = True it returns None,\n",
    "    # so the list would contain only None and pd.concat below would raise.\n",
    "    update = Titanic.loc[same_sex, :].fillna(value = {'Age': Titanic.Age[same_sex].mean()})\n",
    "    fillna_Titanic.append(update)\n",
    "# Stack the per-sex pieces back into one frame (rows end up grouped by sex)\n",
    "Titanic = pd.concat(fillna_Titanic)\n",
    "# Replace missing Embarked values with the most frequent value (the mode)\n",
    "Titanic = Titanic.fillna(value = {'Embarked': Titanic.Embarked.mode()[0]})\n",
    "Titanic.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Categorical predictors that are replaced by indicator (dummy) columns\n",
    "categorical_columns = ['Sex','Embarked','Pclass']\n",
    "# Pclass is cast to a category so that get_dummies expands it as well\n",
    "Titanic['Pclass'] = Titanic['Pclass'].astype('category')\n",
    "# One indicator column per level of each categorical predictor\n",
    "dummy = pd.get_dummies(Titanic[categorical_columns])\n",
    "# Append the indicator columns to the right of the existing columns\n",
    "Titanic = pd.concat([Titanic, dummy], axis = 'columns')\n",
    "# The original categorical columns are no longer needed\n",
    "Titanic.drop(labels = categorical_columns, axis = 'columns', inplace = True)\n",
    "Titanic.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Data-splitting utilities from scikit-learn\n",
    "from sklearn import model_selection\n",
    "# Every column after the first one is used as a predictor\n",
    "predictors = Titanic.columns[1:]\n",
    "# Hold out 25% of the rows as the test set; the fixed seed makes the split repeatable\n",
    "X_train, X_test, y_train, y_test = model_selection.train_test_split(\n",
    "    Titanic[predictors],\n",
    "    Titanic['Survived'],\n",
    "    test_size = 0.25,\n",
    "    random_state = 1234\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Grid-search helper and the decision-tree module\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "from sklearn import tree\n",
    "# Candidate values for each hyper-parameter\n",
    "max_depth = list(range(2, 7))\n",
    "min_samples_split = list(range(2, 9, 2))\n",
    "min_samples_leaf = [2,4,8,10,12]\n",
    "# Parameter grid keyed by the estimator's argument names\n",
    "parameters = dict(\n",
    "    max_depth = max_depth,\n",
    "    min_samples_split = min_samples_split,\n",
    "    min_samples_leaf = min_samples_leaf\n",
    ")\n",
    "# Try every combination with 10-fold cross-validation\n",
    "grid_dtcateg = GridSearchCV(tree.DecisionTreeClassifier(), parameters, cv = 10)\n",
    "grid_dtcateg.fit(X_train, y_train)\n",
    "# Best-scoring parameter combination\n",
    "grid_dtcateg.best_params_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Evaluation metrics\n",
    "from sklearn import metrics\n",
    "# Hyper-parameters of the classification tree\n",
    "tree_settings = {'max_depth': 3, 'min_samples_leaf': 4, 'min_samples_split': 2}\n",
    "CART_Class = tree.DecisionTreeClassifier(**tree_settings)\n",
    "# fit returns the fitted estimator; it is kept under a second name for the tree export cell\n",
    "decision_tree = CART_Class.fit(X_train, y_train)\n",
    "# Predicted classes for the held-out rows\n",
    "pred = CART_Class.predict(X_test)\n",
    "# Share of test rows classified correctly\n",
    "accuracy = metrics.accuracy_score(y_test, pred)\n",
    "print('模型在测试集的预测准确率：\\n', accuracy)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Plotting library\n",
    "import matplotlib.pyplot as plt\n",
    "# Probability assigned to the second class (column 1 of predict_proba) for each test row\n",
    "y_score = CART_Class.predict_proba(X_test)[:, 1]\n",
    "# Points of the ROC curve, then the area under it\n",
    "fpr, tpr, threshold = metrics.roc_curve(y_test, y_score)\n",
    "roc_auc = metrics.auc(fpr, tpr)\n",
    "\n",
    "# Shade the region below the curve\n",
    "plt.stackplot(fpr, tpr, color = 'steelblue', alpha = 0.5, edgecolor = 'black')\n",
    "# Outline the curve itself\n",
    "plt.plot(fpr, tpr, color = 'black', lw = 1)\n",
    "# Dashed diagonal from (0, 0) to (1, 1) as a visual reference\n",
    "plt.plot([0, 1], [0, 1], color = 'red', linestyle = '--')\n",
    "# Write the AUC value inside the plot\n",
    "auc_label = 'ROC curve (area = %0.2f)' % roc_auc\n",
    "plt.text(0.5, 0.3, auc_label)\n",
    "# Axis labels\n",
    "plt.xlabel('1-Specificity')\n",
    "plt.ylabel('Sensitivity')\n",
    "# Render the figure\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Graphviz must be installed on this machine:\n",
    "# https://graphviz.gitlab.io/_pages/Download/Download_windows.html\n",
    "# After unpacking, add its bin directory to the PATH environment variable.\n",
    "# sklearn.externals.six has been removed from scikit-learn, so the in-memory\n",
    "# text buffer comes from the standard library instead.\n",
    "from io import StringIO\n",
    "from sklearn.tree import export_graphviz\n",
    "from IPython.display import Image\n",
    "import pydotplus\n",
    "# Write the DOT description of the fitted tree into the buffer\n",
    "dot_data = StringIO()\n",
    "export_graphviz(\n",
    "    decision_tree,\n",
    "    out_file=dot_data,\n",
    "    # A plain list is passed because some scikit-learn releases reject a pandas Index here\n",
    "    feature_names=list(predictors),\n",
    "    class_names=['Unsurvived','Survived'],\n",
    "    # filled=True,\n",
    "    rounded=True,\n",
    "    special_characters=True\n",
    ")\n",
    "# Turn the DOT text into a PNG and display it inline\n",
    "graph = pydotplus.graph_from_dot_data(dot_data.getvalue())\n",
    "Image(graph.create_png())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Ensemble estimators\n",
    "from sklearn import ensemble\n",
    "# Random forest of 200 trees with a fixed seed; fit returns the estimator itself\n",
    "RF_class = ensemble.RandomForestClassifier(n_estimators = 200, random_state = 1234).fit(X_train, y_train)\n",
    "# Predicted classes for the held-out rows\n",
    "RFclass_pred = RF_class.predict(X_test)\n",
    "# Share of test rows classified correctly\n",
    "rf_accuracy = metrics.accuracy_score(y_test, RFclass_pred)\n",
    "print('模型在测试集的预测准确率：\\n', rf_accuracy)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Probability assigned to the second class (column 1 of predict_proba) for each test row\n",
    "y_score = RF_class.predict_proba(X_test)[:, 1]\n",
    "# Points of the ROC curve, then the area under it\n",
    "fpr, tpr, threshold = metrics.roc_curve(y_test, y_score)\n",
    "roc_auc = metrics.auc(fpr, tpr)\n",
    "# Shaded area under the curve, the curve outline and a dashed reference diagonal\n",
    "plt.stackplot(fpr, tpr, color = 'steelblue', alpha = 0.5, edgecolor = 'black')\n",
    "plt.plot(fpr, tpr, color = 'black', lw = 1)\n",
    "plt.plot([0, 1], [0, 1], color = 'red', linestyle = '--')\n",
    "# Write the AUC value inside the plot\n",
    "auc_label = 'ROC curve (area = %0.2f)' % roc_auc\n",
    "plt.text(0.5, 0.3, auc_label)\n",
    "plt.xlabel('1-Specificity')\n",
    "plt.ylabel('Sensitivity')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Importance score of every predictor in the fitted forest\n",
    "importance = RF_class.feature_importances_\n",
    "# Label each score with its column name so the bars are readable\n",
    "Impt_Series = pd.Series(importance, index = X_train.columns)\n",
    "# Sort ascending and draw horizontal bars. The plot kind must be given by keyword:\n",
    "# current pandas raises a TypeError when Series.plot receives positional arguments.\n",
    "Impt_Series.sort_values(ascending = True).plot(kind = 'barh')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Read the NHANES table from the local Excel workbook\n",
    "NHANES = pd.read_excel(r'C:\\Users\\Administrator\\Desktop\\NHANES.xlsx')\n",
    "# Report the number of rows and columns\n",
    "print(NHANES.shape)\n",
    "# The preview is the last expression of the cell so that the notebook actually renders it\n",
    "NHANES.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Every column except the last one is used as a predictor\n",
    "predictors = NHANES.columns[:-1]\n",
    "# Hold out 25% of the rows as the test set; the fixed seed makes the split repeatable\n",
    "X_train, X_test, y_train, y_test = model_selection.train_test_split(\n",
    "    NHANES[predictors],\n",
    "    NHANES['CKD_epi_eGFR'],\n",
    "    test_size = 0.25,\n",
    "    random_state = 1234\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Candidate values for each hyper-parameter\n",
    "max_depth = list(range(18, 23))\n",
    "min_samples_split = list(range(2, 9, 2))\n",
    "min_samples_leaf = [2,4,8]\n",
    "# Parameter grid keyed by the estimator's argument names\n",
    "parameters = dict(\n",
    "    max_depth = max_depth,\n",
    "    min_samples_split = min_samples_split,\n",
    "    min_samples_leaf = min_samples_leaf\n",
    ")\n",
    "# Try every combination with 10-fold cross-validation\n",
    "grid_dtreg = GridSearchCV(tree.DecisionTreeRegressor(), parameters, cv = 10)\n",
    "grid_dtreg.fit(X_train, y_train)\n",
    "# Best-scoring parameter combination\n",
    "grid_dtreg.best_params_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Hyper-parameters of the regression tree\n",
    "reg_tree_settings = {'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 4}\n",
    "CART_Reg = tree.DecisionTreeRegressor(**reg_tree_settings)\n",
    "# Fit on the training rows\n",
    "CART_Reg.fit(X_train, y_train)\n",
    "# Predicted target values for the held-out rows\n",
    "pred = CART_Reg.predict(X_test)\n",
    "# Mean squared error on the test set (displayed as the cell output)\n",
    "metrics.mean_squared_error(y_test, pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Random forest regressor with 200 trees and a fixed seed; fit returns the estimator itself\n",
    "RF = ensemble.RandomForestRegressor(n_estimators = 200, random_state = 1234).fit(X_train, y_train)\n",
    "# Predicted target values for the held-out rows\n",
    "RF_pred = RF.predict(X_test)\n",
    "# Mean squared error on the test set (displayed as the cell output)\n",
    "metrics.mean_squared_error(y_test, RF_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Importance score of every predictor, labelled with its column name\n",
    "importance = pd.Series(RF.feature_importances_, index = X_train.columns)\n",
    "# Sort ascending and draw horizontal bars. The plot kind must be given by keyword:\n",
    "# current pandas raises a TypeError when Series.plot receives positional arguments.\n",
    "importance.sort_values().plot(kind = 'barh')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [Root]",
   "language": "python",
   "name": "Python [Root]"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
